library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(patchwork)

Load the Weather Data

weather_df = 
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USC00519397", "USS0023B17S"),
    var = c("PRCP", "TMIN", "TMAX"), 
    date_min = "2017-01-01",
    date_max = "2017-12-31") %>%
  mutate(
    name = recode(
      id, 
      USW00094728 = "CentralPark_NY", 
      USC00519397 = "Waikiki_HA",
      USS0023B17S = "Waterhole_WA"),
    tmin = tmin / 10,
    tmax = tmax / 10) %>%
  select(name, id, everything())
## Registered S3 method overwritten by 'hoardr':
##   method           from
##   print.cache_info httr
## using cached file: ~/Library/Caches/R/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2022-09-03 15:53:33 (8.394)
## file min/max dates: 1869-01-01 / 2022-09-30
## using cached file: ~/Library/Caches/R/noaa_ghcnd/USC00519397.dly
## date created (size, mb): 2022-09-03 15:53:37 (1.697)
## file min/max dates: 1965-01-01 / 2020-02-29
## using cached file: ~/Library/Caches/R/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2022-09-03 15:53:39 (0.949)
## file min/max dates: 1999-09-01 / 2022-09-30
weather_df
## # A tibble: 1,095 × 6
##    name           id          date        prcp  tmax  tmin
##    <chr>          <chr>       <date>     <dbl> <dbl> <dbl>
##  1 CentralPark_NY USW00094728 2017-01-01     0   8.9   4.4
##  2 CentralPark_NY USW00094728 2017-01-02    53   5     2.8
##  3 CentralPark_NY USW00094728 2017-01-03   147   6.1   3.9
##  4 CentralPark_NY USW00094728 2017-01-04     0  11.1   1.1
##  5 CentralPark_NY USW00094728 2017-01-05     0   1.1  -2.7
##  6 CentralPark_NY USW00094728 2017-01-06    13   0.6  -3.8
##  7 CentralPark_NY USW00094728 2017-01-07    81  -3.2  -6.6
##  8 CentralPark_NY USW00094728 2017-01-08     0  -3.8  -8.8
##  9 CentralPark_NY USW00094728 2017-01-09     0  -4.9  -9.9
## 10 CentralPark_NY USW00094728 2017-01-10     0   7.8  -6  
## # … with 1,085 more rows

Labels

Providing informative axis labels, plot titles, and captions, via labs(title = "", x = "", y = "", caption = "").

weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017"
  )
## Warning: Removed 15 rows containing missing values (geom_point).

Scales

We can set scales for x and y axis by using scale_x_continuous(). We can also label them by using labels = c() inside of scale_x_continuous().

weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15ºC", "0", "15ºC")) +
  scale_y_continuous(
    trans = "sqrt",
    position = "right"
  )
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 90 rows containing missing values (geom_point).

What we did for the x, y-axis scales:

Color Scales

weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017") + 
  scale_color_hue(
    name = "Location",
    h = c(100, 300))
## Warning: Removed 15 rows containing missing values (geom_point).

Using name = "xxx" to rename the coloring variance, in this case we use (name = “Location”) to change our variable “name” to “Location”. Note: the variable “name” accidentally shared the same name with the (name = “xxx”) command.

The default hue is “h = c(1, 360)”

Viridis Package for Color Scheme

Trying to create your own color scheme usually doesn’t go well. You can use the “viridis package” instead. There are several options, but the default color scheme works nicely!

Note: viridis::scale_color_viridis() assume the variable you put in is a continuous variable. If we are applying it to a discrete variable (this case, location,) we want to write discrete = TRUE inside the scale_color_viridis command.

weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017") + 
  viridis::scale_color_viridis(
    name = "Location",
    discrete = TRUE)
## Warning: Removed 15 rows containing missing values (geom_point).

Themes

Themes are used to modify non-data elements of a plot. They don’t change mappings or how data are render, but control things like background color and location of the the legend. Using themes can help with general plot appearance.

For example, the default legent position is on the right of the graphic. If we want to move it to the bottom, we can use theme(legend.position = "bottom").

weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017") + 
  viridis::scale_color_viridis(
    name = "Location",
    discrete = TRUE) +
  theme(legend.position = "bottom")
## Warning: Removed 15 rows containing missing values (geom_point).

Or we can name the plot and add themes.

ggp_temp_plot =
weather_df %>% 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maxiumum daily temperature (C)",
    caption = "Data from the rnoaa package; temperatures in 2017") + 
  viridis::scale_color_viridis(
    name = "Location",
    discrete = TRUE)
ggp_temp_plot + 
  theme(legend.position = "bottom")
## Warning: Removed 15 rows containing missing values (geom_point).

Some other theme examples: White background with black box around.

ggp_temp_plot + 
  theme_bw()
## Warning: Removed 15 rows containing missing values (geom_point).

Just white background.

ggp_temp_plot + 
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Removed 15 rows containing missing values (geom_point).

ggp_temp_plot + 
  theme_classic()
## Warning: Removed 15 rows containing missing values (geom_point).

In ggthemes there are a lot of themes.

ggp_temp_plot + 
  ggthemes::theme_economist_white()
## Warning: Removed 15 rows containing missing values (geom_point).

ggp_temp_plot + 
  ggthemes::theme_excel()
## Warning: Removed 15 rows containing missing values (geom_point).

ggp_temp_plot + 
  ggthemes::theme_excel_new()
## Warning: Removed 15 rows containing missing values (geom_point).

Setting Opotions

In addition to figure sizing, I include a few other figure preferences in global options declared at the outset of each .Rmd file (this code chunk just gets copy-and-pasted to the beginning of every new file).

library(tidyverse)

knitr::opts_chunk$set(
  fig.width = 6,
  fig.asp = .6,
  out.width = "90%"
)

theme_set(theme_minimal() + theme(legend.position = "bottom"))

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

What we did in this code chunk:

Data Argument in geom

It’s sometimes necessary to overlay data summaries on a plot of the complete data. Depending on the setting, one way to do this is to create a “summary” dataframe and use that when adding a new geom to a ggplot based on the full data.

We can split “weather_df” into separate datasets for Central Park and Waikiki. Then we use one dataset in the ggplot() and use another in geom_line() inside of ggplot().

Note: we only changed the data set used in geom_line(), it still adopt the global aes() settings.

central_park = 
  weather_df %>% 
  filter(name == "CentralPark_NY")

waikiki = 
  weather_df %>% 
  filter(name == "Waikiki_HA")

ggplot(data = waikiki, aes(x = date, y = tmax, color = name)) + 
  geom_point() + 
  geom_line(data = central_park)
## Warning: Removed 3 rows containing missing values (geom_point).

patchwork

Remember faceting? We’ve seen facetting as an approach to create the “same type of plots” for several levels of a categorical variable.

weather_df %>%
  ggplot(aes(x = tmin, fill = name)) +
  geom_density(alpha = 0.5) +
  facet_grid(. ~name)

What happen when you want multiple plots but cannot facet…? Sometimes, we want to combine two or three fundamentally different plots in the same graphic, such as showing a scatterplot and a boxplot, or show scatterplots illustrating relationships between different variables. In this case, the solution is to create each of the panels you want separately and combine panels using tools in the patchwork package which is loaded in the beginning of this R_Markdown file.

tmax_tmin_p = 
  weather_df %>% 
  ggplot(aes(x = tmax, y = tmin, color = name)) + 
  geom_point(alpha = .5) +
  theme(legend.position = "none")

prcp_dens_p = 
  weather_df %>% 
  filter(prcp > 0) %>% 
  ggplot(aes(x = prcp, fill = name)) + 
  geom_density(alpha = .5) + 
  theme(legend.position = "none")

tmax_date_p = 
  weather_df %>% 
  ggplot(aes(x = date, y = tmax, color = name)) + 
  geom_point(alpha = .5) +
  geom_smooth(se = FALSE) + 
  theme(legend.position = "bottom")

(tmax_tmin_p + prcp_dens_p) / tmax_date_p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Data Manipulation

The behavior of plot depends on the data we have supplied. In some cases, it’s easier to control behavior through data manipulation than it is through the plot code.

This is particularly true for the order of categorical or factor variables in plots (this case, “name” variable.) Categorical variables will be ordered alphabetically. Factors will follow the specified order level that underlies the variable labels. We can change the order level of a factor variable to your specified preference using forcats::fct_relevel or according to the value of another variable using forcats::fct_reorder.

weather_df %>%
  mutate(
    name = factor(name),
    name = forcats::fct_relevel(name, c("Waikiki_HA", "CentralPark_NY", "Waterhole_WA"))) %>% 
  ggplot(aes(x = name, y = tmax)) + 
  geom_violin(aes(fill = name), color = "blue", alpha = .5) + 
  theme(legend.position = "bottom")

Note: changing the categorical “name” to factor, via name = factor(name) is not necessary.

weather_df %>%
  mutate(
    name = forcats::fct_relevel(name, c("Waikiki_HA", "CentralPark_NY", "Waterhole_WA"))) %>% 
  ggplot(aes(x = name, y = tmax)) + 
  geom_violin(aes(fill = name), color = "blue", alpha = .5) + 
  theme(legend.position = "bottom")

We reorders “name” according to tmax values in each “name.”

weather_df %>%
  mutate(
    name = forcats::fct_reorder(name, tmax)) %>% 
  ggplot(aes(x = name, y = tmax)) + 
  geom_violin(aes(fill = name), color = "blue", alpha = .5) + 
  theme(legend.position = "bottom")

Other Examples

Weather_df

What if We wanted density graph for tmin and tmax simultaneously?

We want to facet panels across the “name” variable, and create separate density plots for tmax and tmin in each panel. Unfortunately, weather_df isn’t organized in a way that makes this easy.

One solution would recognize that tmax and tmin are separate observation types of a shared temperature variable. With this understanding, we can shift the data into long format which can be used to make the plot directly:

weather_df %>%
  select(name, tmax, tmin) %>%
  pivot_longer(
    tmax:tmin,
    names_to = "observation",
    values_to = "temperature"
  ) %>%
  ggplot(aes(x = temperature, fill = observation)) +
  geom_density(alpha = 0.5) +
  facet_grid(. ~ name)

If we only want to make the graph for CentralPark_NY, we can fillter the info out from the weather_df first.

weather_df %>%
  filter(name == "CentralPark_NY") %>%
  pivot_longer(
    tmax:tmin,
    names_to = "observation",
    values_to = "temperatures"
  ) %>%
  ggplot(aes(x = temperatures, fill = observation)) +
  geom_density(alpha = 0.5)

PULSE Data

The code below imports and tidies the PULSE data, and creates boxplots showing BDI score across visits. Using pivot_longer to organize the BDI score and visit time variables, and organizing the visit time variable into a factor with an informative ordering.

pulse_data = 
  haven::read_sas("./data/public_pulse_data.sas7bdat") %>%
  janitor::clean_names() %>%
  pivot_longer(
    bdi_score_bl:bdi_score_12m,
    names_to = "visit", 
    names_prefix = "bdi_score_",
    values_to = "bdi") %>%
  select(id, visit, everything()) %>%
  mutate(
    visit = recode(visit, "bl" = "00m"),
    visit = factor(visit, levels = str_c(c("00", "01", "06", "12"), "m"))) %>%
  arrange(id, visit)

ggplot(pulse_data, aes(x = visit, y = bdi, fill = visit)) + 
  geom_boxplot()

Note: visit = factor(visit, levels = str_c(c("00", "01", "06", "12"), "m")) is used to order the factor level. So, we see “00m” at the most left of the graph, and “12m” locates at the most right of the graph.

FAS Data

We’ve seen code for data import and organization and for joining the litters and pups data. Here we add some data tidying steps to view pup-level outcomes (such as, post-natal day on which ears “work”, on which the pup can walk, etc.) across values of dose category and treatment day.

pup_data = 
  read_csv("./data/FAS_pups.csv", col_types = "ciiiii") %>%
  janitor::clean_names() %>%
  mutate(sex = recode(sex, `1` = "male", `2` = "female")) 

litter_data = 
  read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
  janitor::clean_names() %>%
  separate(group, into = c("dose", "day_of_tx"), sep = 3)

fas_data = left_join(pup_data, litter_data, by = "litter_number") 

fas_data %>% 
  select(sex, dose, day_of_tx, pd_ears:pd_walk) %>% 
  pivot_longer(
    pd_ears:pd_walk,
    names_to = "outcome", 
    values_to = "pn_day") %>% 
  drop_na() %>% 
  mutate(outcome = forcats::fct_reorder(outcome, pn_day, median)) %>% 
  ggplot(aes(x = dose, y = pn_day)) + 
  geom_violin() + 
  facet_grid(day_of_tx ~ outcome)

Note:

  1. select(sex, dose, day_of_tx, pd_ears:pd_walk) can be achieved by select(sex, dose, day_of_tx, starts_with("pd_)) too.

  2. forcats::fct_reorder(outcome, pn_day, median) let R reorder “outcome” based on the median “pn_day”. It can also be achieved, through a less general way, via forcats::fct_reorder(outcome, "pd_ears", "pd_pivot", "pd_walk", "pd_eyes").